Environment
I wrote the environment class against the OpenAI Gym interface (subclassing gym.Env) and then created an instance of it.
import numpy as np
import gym
from gym import spaces
from kaggle_environments import make

# get_heuristic and alphabeta_agent are defined elsewhere (see the notes below the class)

class ConnectFourGym(gym.Env):
    def __init__(self, agent_policy="random"):
        ks_env = make("connectx", debug=True)
        self.ks_env = ks_env
        # train() pairs our learning agent (the None slot) against the given opponent policy
        self.env = ks_env.train([None, agent_policy])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        self.action_space = spaces.Discrete(self.columns)
        self.observation_space = spaces.Box(low=0, high=2,
                                            shape=(1, self.rows, self.columns), dtype=int)
        self.reward_range = (-10, 3)
        self.spec = None
        self.metadata = None

    def reset(self):
        self.obs = self.env.reset()
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns)

    def normalize(self, x):
        # map the heuristic's assumed range of [-10000, 50000] onto [-1, 1]
        return 2 * (x + 10000) / 60000 - 1

    def change_reward(self, old_reward, done):
        if old_reward == 1:
            # win
            return 3
        elif done:
            # any other terminal state (loss or draw)
            return -3
        else:
            # if the move does not immediately end the game, reward is the heuristic
            # score of the resulting position, scaled to [-1, 1]
            grid = np.asarray(self.obs['board']).reshape(self.rows, self.columns)
            return self.normalize(get_heuristic(grid, self.obs.mark, self.ks_env.configuration))

    def step(self, action):
        if self.obs['board'][int(action)] == 0:
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
        else:
            # invalid move (column already full): penalize heavily and end the episode
            reward, done, _ = -10, True, {}
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns), reward, done, _
env = ConnectFourGym(agent_policy=alphabeta_agent)
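ks_env.train([None, agent_policy]) accepts any ConnectX agent, i.e. a callable that takes an observation and a configuration and returns a column index. As a rough illustration of that interface (the real opponent here, alphabeta_agent, is described next), a random-valid-move opponent could look like this:

import random

def random_valid_agent(obs, config):
    # pick uniformly among columns whose top cell is still empty
    valid_moves = [c for c in range(config.columns) if obs.board[c] == 0]
    return random.choice(valid_moves)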
alphabeta_agent is the analytical agent built in another notebook and explained on this page, and get_heuristic() is the function that scores a position using the heuristic described here.
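From the call in change_reward(), get_heuristic(grid, mark, config) receives the board as a rows x columns array, the player's mark, and the Kaggle configuration. The actual scoring and weights live in the linked page (and they are what the [-10000, 50000] range in normalize() reflects); purely as an illustration of the window-counting idea, with made-up weights, a sketch might look like:

import numpy as np

def get_heuristic_sketch(grid, mark, config):
    # Illustrative only: the real get_heuristic and its weights are defined in the linked notebook.
    inarow = config.inarow
    rows, cols = grid.shape
    opp = 1 if mark == 2 else 2
    # collect every horizontal, vertical and diagonal window of length inarow
    windows = []
    for r in range(rows):
        for c in range(cols - inarow + 1):
            windows.append(grid[r, c:c + inarow])                                  # horizontal
    for r in range(rows - inarow + 1):
        for c in range(cols):
            windows.append(grid[r:r + inarow, c])                                  # vertical
    for r in range(rows - inarow + 1):
        for c in range(cols - inarow + 1):
            windows.append(np.array([grid[r + i, c + i] for i in range(inarow)]))              # diagonal down-right
            windows.append(np.array([grid[r + inarow - 1 - i, c + i] for i in range(inarow)])) # diagonal up-right
    score = 0
    for w in windows:
        mine, theirs, empty = np.sum(w == mark), np.sum(w == opp), np.sum(w == 0)
        if mine == inarow:
            score += 10000   # made-up weight for a completed four
        elif mine == inarow - 1 and empty == 1:
            score += 10      # made-up weight for an open three
        elif theirs == inarow - 1 and empty == 1:
            score -= 100     # made-up penalty for an opponent's open three
    return score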
Essentially, the environment uses alphabeta_agent as the opponent to train against, and the heuristic drives the reward function. The per-step reward is the heuristic score normalized to between -1 and 1 (normalize() maps the heuristic's assumed range of -10000 to 50000 onto that interval), except when the game terminates:
- Win gets 3 points
- Loss gets -3 points (a draw hits the same branch and is also scored -3)
- Invalid move is penalized heavily with -10 points
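As a quick sanity check of this shaping, one can roll out a few episodes with a random (but valid) policy and confirm that per-step rewards stay in [-1, 1] while terminal rewards are ±3 (or -10 on an invalid move). A minimal loop, here using the built-in "random" opponent rather than alphabeta_agent just for the smoke test, might look like:

import random

env = ConnectFourGym(agent_policy="random")
for episode in range(3):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        # choose a random column whose top cell is still empty
        valid = [c for c in range(env.columns) if obs[0, 0, c] == 0]
        obs, reward, done, _ = env.step(random.choice(valid))
        total_reward += reward
    print(f"episode {episode}: total shaped reward = {total_reward:.2f}")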